{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/anaconda3/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from keras.models import Sequential\n",
    "from keras.layers import Dense,Activation\n",
    "from keras.layers.recurrent import SimpleRNN\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fin=open('/home/akishore/alice.txt',encoding='utf-8-sig')\n",
    "lines=[]\n",
    "for line in fin:\n",
    "  line = line.strip().lower()\n",
    "  #line = line.decode(\"ascii\",\"ignore\")\n",
    "  if(len(line)==0):\n",
    "    continue\n",
    "  lines.append(line)\n",
    "fin.close()\n",
    "text = \" \".join(lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text[:100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "text = text.lower()\n",
    "text = re.sub('[^0-9a-zA-Z]+',' ',text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "counts = Counter()\n",
    "counts.update(text.split())\n",
    "words = sorted(counts, key=counts.get, reverse=True)\n",
    "chars = words\n",
    "nb_chars = len(text.split())\n",
    "char2index = {word: i for i, word in enumerate(chars)}\n",
    "index2char = {i: word for i, word in enumerate(chars)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by']\n",
      "her\n"
     ]
    }
   ],
   "source": [
    "SEQLEN = 10\n",
    "STEP = 1\n",
    "input_chars = []\n",
    "label_chars = []\n",
    "text2=text.split()\n",
    "for i in range(0,nb_chars-SEQLEN,STEP):\n",
    "    x=text2[i:(i+SEQLEN)]\n",
    "    y=text2[i+SEQLEN]\n",
    "    input_chars.append(x)\n",
    "    label_chars.append(y)\n",
    "print(input_chars[0])\n",
    "print(label_chars[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3028"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "total_chars = len(set(chars))\n",
    "total_chars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(30407, 10, 3028)\n",
      "(30407, 3028)\n"
     ]
    }
   ],
   "source": [
    "X = np.zeros((len(input_chars), SEQLEN, total_chars), dtype=np.bool)\n",
    "y = np.zeros((len(input_chars), total_chars), dtype=np.bool)\n",
    "# Create encoded vectors for the input and output values\n",
    "for i, input_char in enumerate(input_chars):\n",
    "    for j, ch in enumerate(input_char):\n",
    "        X[i, j, char2index[ch]] = 1\n",
    "    y[i,char2index[label_chars[i]]]=1\n",
    "print(X.shape)\n",
    "print(y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "HIDDEN_SIZE = 128\n",
    "BATCH_SIZE = 1024\n",
    "NUM_ITERATIONS = 100\n",
    "NUM_EPOCHS_PER_ITERATION = 1\n",
    "NUM_PREDS_PER_EPOCH = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "simple_rnn_2 (SimpleRNN)     (None, 128)               404096    \n",
      "_________________________________________________________________\n",
      "dense_2 (Dense)              (None, 3028)              390612    \n",
      "=================================================================\n",
      "Total params: 794,708\n",
      "Trainable params: 794,708\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model = Sequential()\n",
    "model.add(SimpleRNN(HIDDEN_SIZE,return_sequences=False,input_shape=(SEQLEN,total_chars),unroll=True))\n",
    "model.add(Dense(total_chars, activation='sigmoid'))\n",
    "model.compile(optimizer='rmsprop', loss='categorical_crossentropy')\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==================================================\n",
      "Iteration #: 0\n",
      "Epoch 1/1\n",
      "18432/30407 [=================>............] - ETA: 27s - loss: 7.2330"
     ]
    }
   ],
   "source": [
    "for iteration in range(50):\n",
    "    print(\"=\" * 50)\n",
    "    print(\"Iteration #: %d\" % (iteration))\n",
    "    # Fitting the values\n",
    "    model.fit(X, y, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS_PER_ITERATION)\n",
    "\n",
    "    # Time to see how our predictions fare\n",
    "    # We are creating a test set from a random location in our dataset\n",
    "    # In the code below, we are selecting a random input as our seed value of words\n",
    "    test_idx = np.random.randint(len(input_chars))\n",
    "    test_chars = input_chars[test_idx]\n",
    "    print(\"Generating from seed: %s\" % (test_chars))\n",
    "    print(test_chars)\n",
    "    # From the seed words, we are tasked to predict the next words\n",
    "    # In the code below, we are predicting the next 100 words (NUM_PREDS_PER_EPOCH) after the seed words\n",
    "    for i in range(NUM_PREDS_PER_EPOCH):\n",
    "        # Pre processing the input data, just like the way we did before training the model\n",
    "        Xtest = np.zeros((1, SEQLEN, total_chars))\n",
    "        for i, ch in enumerate(test_chars):\n",
    "            Xtest[0, i, char2index[ch]] = 1\n",
    "        # Predict the next word\n",
    "        pred = model.predict(Xtest, verbose=0)[0]\n",
    "        # Given that, the predictions are probability values, we take the argmax to fetch the location of highest probability\n",
    "        # Extract the word belonging to argmax\n",
    "        ypred = index2char[np.argmax(pred)]\n",
    "        print(ypred)\n",
    "        # move forward with test_chars + ypred so that we use the original 9 words + prediction for the next prediction\n",
    "        test_chars = test_chars[1:] + [ypred]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [anaconda3]",
   "language": "python",
   "name": "Python [anaconda3]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
